load libraries and define functions
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(EnhancedVolcano)
## Loading required package: ggplot2
## Loading required package: ggrepel
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(AnnotationDbi)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:matrixStats':
##
## count
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(msigdbr)
library(clusterProfiler)
##
## clusterProfiler v4.2.2 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
library(scales)
library(ggpubr)
library(ggplot2)
library(enrichplot)
##
## Attaching package: 'enrichplot'
## The following object is masked from 'package:ggpubr':
##
## color_palette
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pheatmap)
library(org.Hs.eg.db)
##
library(xCell)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# function to read experiment designs from our datasets
read_design <- function(filename) {
E <- read.csv(filename, sep = "\t")
rownames(E) <- E$Run
E <- E[c("Sample.Characteristic.disease.", "Sample.Characteristic.organism.part.")]
colnames(E) <- c("Diagnosis", "Brodmann")
E[E == "bipolar disorder"] = "bipolar"
E$`Brodmann` <- sub("Brodmann \\(1909\\) area ", "", E$`Brodmann`)
E[E == "dorsolateral prefrontal cortex"] = "46"
return(E)
}
Load Data
# read the experiments' designs
E7 <- read_design("../Databases/Bipolar - E-GEOD-78936/E-GEOD-78936-experiment-design.tsv")
E5 <- read_design("../Databases/Bipolar - E-GEOD-53239/E-GEOD-53239-experiment-design.tsv")
# add identifiers for each dataset
E5 <- cbind(rep("GEOD-53239", times=nrow(E5)), E5)
E7 <- cbind(rep("GEOD-78936", times=nrow(E7)), E7)
colnames(E5)[1] = colnames(E7)[1] = "Dataset"
# combine metadata
metadata <- rbind(E5, E7)
# read counts data
data1 <- read.delim("../Databases/Bipolar - E-GEOD-53239/E-GEOD-53239-raw-counts.tsv")
data2 <- read.delim("../Databases/Bipolar - E-GEOD-78936/E-GEOD-78936-raw-counts.tsv")
# combine only the counts data
counts <- cbind(data1[-1][-1], data2[-1][-1])
counts <- counts[sort(colnames(counts))]
# take the gene names from the counts data
genes.symbols <- data1[,2]
# if there is no gene name, use the ID
genes.symbols[genes.symbols == ""] <- data1$Gene.ID[genes.symbols == ""]
# replace dupliactes with ID too
genes.symbols[duplicated(genes.symbols)] <- data1$Gene.ID[duplicated(genes.symbols)]
Run DESeq2
# run DESeq2 on combined data from both experiments
# using Dataset to distinguish between them which will remove the batch effect.
# Since Dataset is a division contained in Brodmann, using Brodmann is enough
dds <- DESeqDataSetFromMatrix(countData=counts, colData=metadata, design= ~Brodmann + Diagnosis)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 18 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
rownames(dds) <- genes.symbols
# run different DESeq2 for each brain area
areas <- c(9, 11, 24, 46)
areaDds <- list()
for (i in seq_along(areas)){
areaDds[[i]] <- DESeqDataSetFromMatrix(countData=counts[,metadata$Brodmann == areas[i]], colData=metadata[metadata$Brodmann == areas[i],], design= ~Diagnosis)
areaDds[[i]] <- DESeq(areaDds[[i]])
}
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 92 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 45 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 15 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 54 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
Analyze the data:
# do logfold change for Diagnosis
# normal vs bipolar
resLFC.bipolar.normal <- lfcShrink(dds, contrast = c("Diagnosis", "bipolar", "normal"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
resLFC.bipolar.normal["symbol"] <- genes.symbols
resOrdered.bipolar.normal <- resLFC.bipolar.normal[order(resLFC.bipolar.normal$pvalue),]
# bipolar vs schizophrenia
resLFC.bipolar.schizo <- lfcShrink(dds, contrast = c("Diagnosis", "bipolar", "schizophrenia"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
resLFC.bipolar.schizo["symbol"] <- genes.symbols
resOrdered.bipolar.schizo <- resLFC.bipolar.schizo[order(resLFC.bipolar.schizo$pvalue),]
# differentiate brain areas
resLFC.areas <- list()
resOrdered.areas <- list()
comparisons <- list()
j <- 1
for (i in seq_along(areas)){
controls = c("normal", "schizophrenia")
if (areas[i] == 46){
# area 46 has no SZ patients
controls = controls[1]
}
for (control in controls){
comparison <- paste0("bipolar vs ", control, " in ", areas[i])
print(paste0("Calculating ", comparison))
resLFC.areas[[j]] <- lfcShrink(areaDds[[i]], contrast = c("Diagnosis", "bipolar", control), type = "ashr")
resLFC.areas[[j]]["symbol"] <- genes.symbols
resOrdered.areas[[j]] <- resLFC.areas[[j]][order(resLFC.areas[[j]]$pvalue),]
comparisons[[j]] <- comparison
j = j + 1
}
}
## [1] "Calculating bipolar vs normal in 9"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 9"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 11"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 11"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 24"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs schizophrenia in 24"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
## [1] "Calculating bipolar vs normal in 46"
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
# normalize our data
dds.norm <- vst(dds, blind=F)
rownames(dds.norm) <- genes.symbols
# remove batch effect for PCA
mat <- assay(dds.norm)
mat <- removeBatchEffect(mat, dds.norm$Dataset)
assay(dds.norm) <- mat
Make some nice looking graphs :)
# create volcano plots
EnhancedVolcano(resOrdered.bipolar.normal, lab = resOrdered.bipolar.normal$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title="bipolar vs normal")
EnhancedVolcano(resOrdered.bipolar.schizo, lab = resOrdered.bipolar.schizo$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title="bipolar vs schizophrenia")
for (i in seq_along(comparisons)){
print(EnhancedVolcano(resOrdered.areas[[i]], lab = resOrdered.areas[[i]]$symbol, x = 'log2FoldChange', y = 'padj', FCcutoff=2, pCutoff = 10e-10, title=comparisons[i]))
}
## Warning: Removed 2 rows containing missing values (geom_vline).
## Warning: Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 2 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).
## Warning: Removed 1 rows containing missing values (geom_vline).
## Removed 1 rows containing missing values (geom_hline).
# make some pca plots
plotPCA(dds.norm, intgroup=c("Diagnosis"))
plotPCA(dds.norm, intgroup=c("Brodmann"))
plotPCA(dds.norm, intgroup=c("Diagnosis", "Brodmann"))
Some count plots for significant genes :)
# significant genes to check in violin plots
genes <- c("LINC02340", "MTND6P4", "MT1X", "IL1RL1")
# compare the means of each two groups
means_comp <- list(c("bipolar", "normal"), c("bipolar", "schizophrenia"), c("normal", "schizophrenia"))
# make the violin plots
for (gene in genes) {
gene.index <- which(genes.symbols %in% gene)
plot <- plotCounts(dds, gene = rownames(dds)[gene.index], intgroup = c('Diagnosis'), xlab="Diagnosis", title=paste0(gene, " counts"), returnData = T)
print(
ggplot(plot, aes(x = Diagnosis, y = count, fill=Diagnosis)) +
geom_violin(draw_quantiles = 0.5) +
geom_jitter(color="royalblue", size=0.6, alpha=0.5) +
theme(legend.position="none") +
ggtitle(paste0("Gene Expression of ", gene)) +
scale_y_continuous(trans = log10_trans()) +
stat_compare_means(label = "p.signif", comparisons = means_comp, method = "wilcox.test")
)
}
# significant genes in single areas to check in violin plots
areaSpecific <- data.frame(GeneName = c("IGKC", "CHI3L2", "MT1X", "MTND6P4"), Brodmann = c(46, 46, 46, 46))
# make the violin plots
for (geneIndex in 1:nrow(areaSpecific)) {
area <- areaSpecific[geneIndex,]$Brodmann
gene <- areaSpecific[geneIndex,]$GeneName
thisDds = areaDds[[which(areas == area)]]
rownames(thisDds) <- genes.symbols
plot <- plotCounts(thisDds, gene = gene, intgroup = c('Diagnosis'), xlab="Diagnosis", title=paste0(gene, " counts"), returnData = T)
comp = means_comp
if (area == 46){
# area 46 has no SZ patients
comp = means_comp[1]
}
print(
ggplot(plot, aes(x = Diagnosis, y = count, fill=Diagnosis)) +
scale_fill_manual(values=c("#f8766d", "#00ba38", "#619cff")) +
geom_violin(draw_quantiles = 0.5) +
geom_jitter(color="royalblue", size=0.6, alpha=0.5) +
theme(legend.position="none") +
ggtitle(paste0("Gene Expression of ", gene, " in ", area)) +
scale_y_continuous(trans = log10_trans()) +
stat_compare_means(label = "p.signif", comparisons = comp, method = "wilcox.test")
)
}
## Warning in wilcox.test.default(c(0.277745867479051, -0.301029995663981, : cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(c(-0.301029995663981, 0.359390413188311, : cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(c(0.517353315321231, 0.359390413188311,
## 0.314141856807374, : cannot compute exact p-value with ties
And Pathway analysis :)
# remove NAs and create sorted genes list with LFC values and ENSMBLE ID as names
DE_genes_entrez_rank <- resOrdered.bipolar.normal[!is.na(resOrdered.bipolar.normal$padj),]
DE_genes_list <- DE_genes_entrez_rank$log2FoldChange
names(DE_genes_list) <- DE_genes_entrez_rank$symbol
DE_genes_list <- sort(DE_genes_list, decreasing=TRUE)
# collect hallmarks
hallmarks <- msigdbr(species = "Homo sapiens", category = "H") %>% dplyr::select(gs_name, gene_symbol)
# set nPerm to million so it's absolutely precise and does not change when the random seed changes
hm <- GSEA(DE_genes_list, TERM2GENE=hallmarks, nPerm=1000000)
## preparing geneSet collections...
## GSEA analysis...
## Warning in .GSEA(geneList = geneList, exponent = exponent, minGSSize =
## minGSSize, : We do not recommend using nPerm parameter incurrent and future
## releases
## Warning in fgsea(pathways = geneSets, stats = geneList, nperm = nPerm, minSize
## = minGSSize, : You are trying to run fgseaSimple. It is recommended to use
## fgseaMultilevel. To run fgseaMultilevel, you need to remove the nperm argument
## in the fgsea function call.
## leading edge analysis...
## done...
# create a ridgeplot
ridgeplot(hm) + labs(x = "enrichment distribution")
## Picking joint bandwidth of 0.0774
# and a dot plot
dotplot(hm, showCategory = 178, title="Pathway Analisys with GSEA, normal VS bipolar")
# We tried running the same code on resOrdered.bipolar.schizo, but it found no enriched terms
Clustering, no interesting data
# get normalized counts only for bipolar
sampleFilter <- dds$Diagnosis != "schizophrenia"
normcounts <- assay(vst(dds[,sampleFilter], blind=T))
colnames(normcounts) <- paste0(dds$Diagnosis[sampleFilter], seq(1, ncol(normcounts)))
var_per_gene <- apply(normcounts, 1, var)
# take 1000 most significant genes
selectedGenes <- names(var_per_gene[order(var_per_gene, decreasing = T)][1:20])
selectedGenes <- genes
normcounts.topGenes <- t(normcounts[selectedGenes,])
# make an elbow plot (optimal k seems to be 2 or 3)
fviz_nbclust(normcounts.topGenes, FUN = hcut, method = "wss")
dist_mat <- dist(normcounts.topGenes, method = 'euclidean')
# plot the cluster dendrogram
hclust_avg <- hclust(dist_mat, method = 'average')
plot(hclust_avg, cex = 0.6, hang = -1, xlab="", ylab ="", sub="", axes=F)
rect.hclust(hclust_avg, k = 3, border = 2:8)
Heatmap
# take normalized counts for normal and bipolar
sampleFilter <- dds$Diagnosis != "schizophrenia"
normcounts <- assay(vst(dds[,sampleFilter], blind=T))
# create labels for heatmap
df <- data.frame(row.names = colnames(dds[,sampleFilter]),
diagnosis = colData(dds[,sampleFilter])$Diagnosis,
area = colData(dds[,sampleFilter])$Brodmann)
# Take top 10 genes with the lowest p-value that express in Bipolar (log2FoldChange>0)
selectUp <- resOrdered.bipolar.normal$symbol[resOrdered.bipolar.normal$log2FoldChange>0][1:10]
# Take top 10 genes with the lowest p-value that express in Healthy (log2FoldChange<0)
selectDown <- resOrdered.bipolar.normal$symbol[resOrdered.bipolar.normal$log2FoldChange<0][1:10]
selected <- c(selectUp,selectDown)
# create the heatmap
pheatmap(normcounts[selected,], cluster_rows=TRUE,
show_colnames = FALSE, cluster_cols=TRUE,
annotation_col=df, scale = 'row', cutree_cols = 2, cutree_rows = 2)
# set rownames of counts so xCell understands what are the genes
rownames(counts) <- genes.symbols
# run the analysis
counts.xCell <- xCellAnalysis(counts, rnaseq=T)
## [1] "Num. of genes: 10535"
## Setting parallel calculations through a MulticoreParam back-end
## with workers=4 and tasks=100.
## Estimating ssGSEA scores for 489 gene sets.
##
|
|======================================================================| 100%
p <- pheatmap(counts.xCell, show_colnames = F, silent = T)
# filter results as most of it is not valuble information
counts.xCell.filtered <- counts.xCell[rowMeans(counts.xCell) > 0.01,]
# scale for heatmap, avarage should be 0
counts.xCell.filtered.scaled <- t(scale(t(counts.xCell.filtered)))
# clip values
counts.xCell.filtered.scaled.clipped <- counts.xCell.filtered.scaled
counts.xCell.filtered.scaled.clipped[counts.xCell.filtered.scaled > 1] <- 1
counts.xCell.filtered.scaled.clipped[counts.xCell.filtered.scaled < -1] <- -1
# clustering metadata for heatmap
df <- data.frame(row.names = colnames(dds),
diagnosis = colData(dds)$Diagnosis,
area = colData(dds)$Brodmann)
# show results
pheatmap(counts.xCell.filtered.scaled.clipped, cluster_rows=TRUE,
show_colnames = FALSE, cluster_cols=TRUE,
annotation_col=df, scale = 'row', cutree_cols = 4, cutree_rows = 2)
# analyze clustering
k = 2
myclusters <- cutree(p$tree_col, k=k)
# show correlation between clusters and known factors about the patients
patients <- table(metadata$Diagnosis, myclusters)
mosaicplot(patients)
patients <- table(metadata$Brodmann, myclusters)
mosaicplot(patients)
# create every combination of clusters to see difference in the data
comps <- t(combn(1:k, 2))
comps <- split(comps, seq(nrow(comps)))
# violin plots of cell score in every cluster
for (cellType in rownames(counts.xCell.filtered)) {
MEscore.df <- data.frame("score" = counts.xCell[cellType,],
"cluster" = factor(myclusters),
row.names = names(myclusters))
print(
ggplot(MEscore.df, aes(x=cluster, y=score, fill=cluster)) +
geom_violin(draw_quantiles = 0.5) +
labs(x="", y="Score") + stat_compare_means(method = "wilcox.test", comparisons = comps, label = "p.signif") +
ggtitle(paste0("Cluster comparison of ", cellType))
)
}
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
# see how the cell type divide the groups on a plot
chosenCells = c("Th1 cells", "Smooth muscle", "MSC")
scores.df <- data.frame("cluster" = factor(myclusters),
row.names = names(myclusters),
area=paste0(metadata$Brodmann, ":", factor(myclusters)))
# create a
for (cell in chosenCells) {
scores.df[cell] <- counts.xCell[cell,]
}
# create a list of every two cell type combination from choseCells
comps <- t(combn(1:length(chosenCells), 2))
comps <- split(comps, seq(nrow(comps)))
# see how well each two cell types divide the patients to the clusters
for (comp in comps){
print(
ggplot(scores.df, aes(x=scores.df[,chosenCells[comp[1]]], y=scores.df[,chosenCells[comp[2]]], color=cluster)) +
geom_point() +
xlab(chosenCells[comp[1]]) +
ylab(chosenCells[comp[2]])
)
}
Overall, we can see that the division is quite good, and that there is some kind of difference between the clusters, but nothing that is expressed in other mediacal data we have